In [1]:
import pandas as pd
import numpy as nm
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw dataset from the working directory and preview the first rows.
DATA_PATH = "corona.csv"
df = pd.read_csv(DATA_PATH)
df.head()
Out[1]:
Id Location Weekly Cases Year Weekly Cases per Million Weekly Deaths Weekly Deaths per Million Total Vaccinations People Vaccinated People Fully Vaccinated Total Boosters Daily Vaccinations Total Vaccinations per Hundred People Vaccinated per Hundred People Fully Vaccinated per Hundred Total Boosters per Hundred Daily Vaccinations per Hundred Daily People Vaccinated Daily People Vaccinated per Hundred Next Week's Deaths
0 911530868 World 2372.0 2020 0.300 65.0 0.008 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 344
1 807936902 World 5023.0 2020 0.635 114.0 0.014 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 361
2 773590408 World 5612.0 2020 0.710 116.0 0.015 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 431
3 130466459 World 7580.0 2020 0.958 153.0 0.019 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 463
4 544040446 World 8983.0 2020 1.136 187.0 0.024 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 506
In [2]:
# Print the (rows, columns) dimensions, then show the number of missing
# values in each column as the cell's output.
print(df.shape)
df.isna().sum()
(129156, 20)
Out[2]:
Id                                          0
Location                                    0
Weekly Cases                              213
Year                                        0
Weekly Cases per Million                  868
Weekly Deaths                            1258
Weekly Deaths per Million                1909
Total Vaccinations                      88886
People Vaccinated                       90727
People Fully Vaccinated                 92582
Total Boosters                         109309
Daily Vaccinations                      51316
Total Vaccinations per Hundred          88886
People Vaccinated per Hundred           90727
People Fully Vaccinated per Hundred     92582
Total Boosters per Hundred             109309
Daily Vaccinations per Hundred          51316
Daily People Vaccinated                 51765
Daily People Vaccinated per Hundred     51765
Next Week's Deaths                          0
dtype: int64
In [3]:
# Recompute "Total Vaccinations" as the sum of the two people-level counts.
# NOTE(review): this overwrites the value supplied in the CSV and does not
# include booster doses -- confirm that this approximation is intended.
df["Total Vaccinations"] = df["People Vaccinated"] + df["People Fully Vaccinated"]

# Keep only rows in which every column is populated. Rows where the sum above
# is NaN are removed here as well, because a NaN sum implies that at least one
# of the two summand columns is NaN. (The earlier un-assigned
# `.fillna("0")` call had no effect on `df` and has been removed.)
df.dropna(inplace=True)

# Report the remaining dimensions and confirm that no missing values are left.
print(df.shape)
df.isnull().sum()
(18728, 20)
Out[3]:
Id                                     0
Location                               0
Weekly Cases                           0
Year                                   0
Weekly Cases per Million               0
Weekly Deaths                          0
Weekly Deaths per Million              0
Total Vaccinations                     0
People Vaccinated                      0
People Fully Vaccinated                0
Total Boosters                         0
Daily Vaccinations                     0
Total Vaccinations per Hundred         0
People Vaccinated per Hundred          0
People Fully Vaccinated per Hundred    0
Total Boosters per Hundred             0
Daily Vaccinations per Hundred         0
Daily People Vaccinated                0
Daily People Vaccinated per Hundred    0
Next Week's Deaths                     0
dtype: int64
In [4]:
# Overwrite every derived column with its raw-count column divided by a fixed
# constant: 1,000,000 for the "per Million" columns, 100 for "per Hundred".
# NOTE(review): these are plain rescalings of the raw counts, not
# population-normalised rates, and they replace the values loaded from the
# CSV -- confirm that this is intended.
scaled_columns = {
    "Weekly Cases per Million": ("Weekly Cases", 1000000),
    "Weekly Deaths per Million": ("Weekly Deaths", 1000000),
    "Total Boosters per Hundred": ("Total Boosters", 100),
    "Total Vaccinations per Hundred": ("Total Vaccinations", 100),
    "People Vaccinated per Hundred": ("People Vaccinated", 100),
    "People Fully Vaccinated per Hundred": ("People Fully Vaccinated", 100),
    "Daily Vaccinations per Hundred": ("Daily Vaccinations", 100),
    "Daily People Vaccinated per Hundred": ("Daily People Vaccinated", 100),
}
for derived, (source, divisor) in scaled_columns.items():
    df[derived] = df[source] / divisor

df.head()
Out[4]:
Id Location Weekly Cases Year Weekly Cases per Million Weekly Deaths Weekly Deaths per Million Total Vaccinations People Vaccinated People Fully Vaccinated Total Boosters Daily Vaccinations Total Vaccinations per Hundred People Vaccinated per Hundred People Fully Vaccinated per Hundred Total Boosters per Hundred Daily Vaccinations per Hundred Daily People Vaccinated Daily People Vaccinated per Hundred Next Week's Deaths
241 275164452 World 4174523.0 2020 4.174523 77527.0 0.077527 7276178.0 7231498.0 44680.0 1.0 897447.0 72761.78 72314.98 446.80 0.01 8974.47 690726.0 6907.26 81042
242 857254713 World 4424216.0 2021 4.424216 79456.0 0.079456 9109346.0 9050886.0 58460.0 9.0 1079269.0 91093.46 90508.86 584.60 0.09 10792.69 735617.0 7356.17 92754
243 515683834 World 4553174.0 2021 4.553174 80332.0 0.080332 11535235.0 11343354.0 191881.0 15.0 1303377.0 115352.35 113433.54 1918.81 0.15 13033.77 851085.0 8510.85 94477
244 725478352 World 4619286.0 2021 4.619286 79640.0 0.079640 12944964.0 12578084.0 366880.0 23.0 1397939.0 129449.64 125780.84 3668.80 0.23 13979.39 845521.0 8455.21 96212
245 844503137 World 4649535.0 2021 4.649535 81042.0 0.081042 14652786.0 14002427.0 650359.0 27.0 1581369.0 146527.86 140024.27 6503.59 0.27 15813.69 928498.0 9284.98 96742
In [5]:
# Remove the row identifier and the location label from the frame in a single
# call, then list the columns that remain.
df.drop(columns=["Id", "Location"], inplace=True)
df.columns
Out[5]:
Index(['Weekly Cases', 'Year', 'Weekly Cases per Million', 'Weekly Deaths',
       'Weekly Deaths per Million', 'Total Vaccinations', 'People Vaccinated',
       'People Fully Vaccinated', 'Total Boosters', 'Daily Vaccinations',
       'Total Vaccinations per Hundred', 'People Vaccinated per Hundred',
       'People Fully Vaccinated per Hundred', 'Total Boosters per Hundred',
       'Daily Vaccinations per Hundred', 'Daily People Vaccinated',
       'Daily People Vaccinated per Hundred', 'Next Week's Deaths'],
      dtype='object')
In [6]:
# Remove the "Year" column so that it is not included in the pair plot.
df.drop(columns=["Year"], inplace=True)

# Pairwise scatter plots of all remaining columns, with kernel-density
# estimates on the diagonal. `sns` is the seaborn alias imported in the
# notebook's first cell, so it is not imported again here.
sns.pairplot(data=df, diag_kind='kde')
C:\Users\hp\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
Out[6]:
<seaborn.axisgrid.PairGrid at 0x2260abef7d0>
In [7]:
# Columns whose pairwise correlations are visualised: the raw count columns
# plus the prediction target.
heatmap_columns = [
    'Weekly Cases',
    'Weekly Deaths',
    'Total Vaccinations',
    'People Vaccinated',
    'People Fully Vaccinated',
    'Total Boosters',
    'Daily Vaccinations',
    'Daily People Vaccinated',
    "Next Week's Deaths",
]

# Correlation matrix rendered as an annotated heatmap.
correlations = df[heatmap_columns].corr()
sns.heatmap(correlations, cmap='Blues', annot=True)
plt.show()
In [13]:
# Remove the regressor attributes that have less correlation to the response
# variable (one call instead of four separate in-place drops).
df.drop(
    columns=[
        "Total Boosters",
        "People Fully Vaccinated",
        "People Vaccinated",
        "Total Vaccinations",
    ],
    inplace=True,
)

# Feature matrix. Any missing value is filled with the NUMBER 0 rather than
# the string "0": a string fill value would turn a numeric column into a
# mixed object column. Rows containing NaN were already removed by the earlier
# `dropna`, so this is a safeguard only.
feature_columns = ['Weekly Cases', 'Weekly Deaths', 'Daily Vaccinations', 'Daily People Vaccinated']
x = df[feature_columns].fillna(0)

# Response variable.
y = df["Next Week's Deaths"]
x, y
Out[13]:
(        Weekly Cases  Weekly Deaths  Daily Vaccinations  \
 241        4174523.0        77527.0            897447.0   
 242        4424216.0        79456.0           1079269.0   
 243        4553174.0        80332.0           1303377.0   
 244        4619286.0        79640.0           1397939.0   
 245        4649535.0        81042.0           1581369.0   
 ...              ...            ...                 ...   
 129149         554.0           15.0              7129.0   
 129151         464.0           13.0              5665.0   
 129152         471.0           12.0              5295.0   
 129154         277.0            6.0              5358.0   
 129155         277.0            6.0              6190.0   
 
         Daily People Vaccinated  
 241                    690726.0  
 242                    735617.0  
 243                    851085.0  
 244                    845521.0  
 245                    928498.0  
 ...                         ...  
 129149                   2177.0  
 129151                   1427.0  
 129152                   1362.0  
 129154                   1633.0  
 129155                   2102.0  
 
 [18728 rows x 4 columns],
 241       81042
 242       92754
 243       94477
 244       96212
 245       96742
           ...  
 129149        6
 129151        9
 129152        7
 129154        7
 129155        8
 Name: Next Week's Deaths, Length: 18728, dtype: int64)
In [14]:
# Display the column labels of the feature frame `x` built in the previous cell.
x.columns
Out[14]:
Index(['Weekly Cases', 'Weekly Deaths', 'Daily Vaccinations',
       'Daily People Vaccinated'],
      dtype='object')
In [15]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (zero mean, unit variance). After this cell
# `x` is a NumPy array instead of a DataFrame; the target `y` is left unscaled.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later -- confirm this is acceptable for the chosen model.
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)
x, y
Out[15]:
(array([[ 2.36390907,  6.5447199 , -0.13662119,  0.05656583],
        [ 2.5245435 ,  6.71682839, -0.099713  ,  0.07874118],
        [ 2.60750575,  6.79498652, -0.05422114,  0.13578029],
        ...,
        [-0.32137423, -0.37129315, -0.31771983, -0.28396757],
        [-0.32149904, -0.37182847, -0.31770705, -0.2838337 ],
        [-0.32149904, -0.37182847, -0.31753816, -0.28360203]]),
 241       81042
 242       92754
 243       94477
 244       96212
 245       96742
           ...  
 129149        6
 129151        9
 129152        7
 129154        7
 129155        8
 Name: Next Week's Deaths, Length: 18728, dtype: int64)
In [ ]:
 
In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Fit a linear regression on the training portion and predict the held-out rows.
lr = LinearRegression()
model = lr.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Metrics to evaluate the model: R^2 expressed as a percentage, mean absolute
# error, and root mean squared error.
r2_percent = r2_score(y_test, y_pred) * 100
mae = mean_absolute_error(y_test, y_pred)
rmse = nm.sqrt(mean_squared_error(y_test, y_pred))
r2_percent, mae, rmse
Out[16]:
(98.91008558095014, 411.1428851158433, 1164.553738645361)
In [17]:
import pickle

# Persist the fitted regression model. The context manager closes the file
# handle even if serialisation raises, which the manual open()/close() pair
# did not guarantee.
with open("model.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)

# The model was trained on standardised features, so the fitted scaler `sc`
# is saved alongside it; inference code has to apply the same transformation
# to raw inputs before calling `model.predict`.
with open("scaler.pkl", "wb") as scaler_out:
    pickle.dump(sc, scaler_out)